Goal: Compare super claims 1, 3, and 5 — 1: Not Happening; 3: Climate Impacts Not Bad; 5: Science/Scientists Not Reliable.

knitr::opts_chunk$set(echo = TRUE)
library(jsonlite) # allows us to read in json files
library(tidyverse) # allows us to do lots of data manipulation and basic data science
library(here) # allows us to cut out long file paths (ex. "users/connor/downloads/etc")
library(forcats) # factor manipulation helpers (fct_* functions)
library(tidytext) # allows us to tokenize data 
library(dplyr) # allows us to manipulate dataframes
library(stringr) # allows us to count the number of words in a cell
library(quanteda) # allows us to tokenize data
library(quanteda.textplots) # allows us to make network plots
library(gridExtra) # allows us to combine multiple plots into 1
library(wordcloud) # allows us to generate word clouds
library(fmsb)
library(plotly) #interactive ggplot graphs
library(ggthemes) # more themes for ggplot
library(tm) #for textmining corpus(), removePunctuation()
library(syuzhet) # for sentiment analysis, getNrc()
library(wordcloud2) # for comparison clouds 
library(plotrix) # for pyramid plots
library(RColorBrewer) # for more color palettes
# Load the labelled claim data set: one row per text snippet, with its
# claim code (e.g. "1_1", "5_2") in the `claim` column.
nature_analysis <- read_csv(here("data/training.csv"))
## Rows: 23436 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text, claim
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Super Claim 1 (Not Happening): use filter() to select super claim 1

# Super claim 1 ("Not Happening"): keep rows whose claim code contains "1_".
na_1 <- filter(nature_analysis, str_detect(claim, "1_"))

Add word_count column using mutate()

# Add a word_count column: number of non-whitespace runs (words) per text.
# Fixed: reference the column as bare `text` inside mutate() instead of
# `na_1$text`, which bypasses tidy evaluation and breaks under grouping.
na_1 <- na_1 %>%
  mutate(word_count = str_count(text, "\\S+"))

Distribution visual, geom_histogram

# Histogram of claim lengths (in words) for super claim 1,
# filled by the specific sub-claim label.
ggplot(data = na_1, mapping = aes(x = word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  labs(title = "Distribution of Claims",
       subtitle = "Claim 1") +
  theme(text = element_text(family = "Menlo-Bold", size = 12),
        legend.title = element_text(family = "Menlo-Bold", size = 12))

Tokenize using unnest_tokens() to separate text into words

# One row per word, then tally word frequencies, most frequent first.
na_1_tokenized <- na_1 %>%
  unnest_tokens(words, text) %>%
  count(words, sort = TRUE)

Filter() out stopwords()

# Drop English stopwords before plotting.
na_1_tokenized <- filter(na_1_tokenized, !(words %in% stopwords("english")))

Word Cloud 1

# Word cloud of the 200 most frequent claim-1 words (minimum frequency 5).
wordcloud(
  na_1_tokenized$words,
  freq = na_1_tokenized$n,
  max.words = 200,
  min.freq = 5,
  random.order = FALSE,
  colors = c("royalblue1", "seagreen2", "orangered"),
  family = "Avenir"
)

# Claim-1 co-occurrence network: corpus -> cleaned tokens -> feature
# co-occurrence matrix -> network of the 30 most frequent features.
na_1_corpus <- corpus(na_1$text)

toks <- tokens(na_1_corpus, remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)

fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)

# Bigrams for claim 1: split texts into word pairs, drop any pair that
# contains a stopword, re-join the pair, then count occurrences.
na_1_claims <- select(na_1, text)

ngrams <- na_1_claims %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ")

ngrams_1 <- count(ngrams, bigram, sort = TRUE)
# Convert claim-1 texts to a plain character vector for syuzhet.
na_1_claims_vector <- as.character(na_1_claims$text)
# NRC sentiment/emotion scores, one row per text.
na_1_sentiment <- get_nrc_sentiment(na_1_claims_vector)
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `spread()` instead.
## ℹ The deprecated feature was likely used in the syuzhet package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Sum each sentiment column over all texts into a one-column data frame.
na_1_sentiment_score <- data.frame(colSums(na_1_sentiment[,]))

names(na_1_sentiment_score) <- 'score'

# Promote the sentiment row names to a regular column for ggplot.
na_1_sentiment_score <- cbind("sentiment" = rownames(na_1_sentiment_score), na_1_sentiment_score)

#rownames(training_sentiment_score) <- NULL

# Bar chart of total NRC sentiment scores for super claim 1.
ggplot(na_1_sentiment_score, aes(x = sentiment, y = score)) +
  geom_bar(aes(fill = sentiment), stat="identity") +
  labs(x = "Sentiments", y = "Scores", title = "Sentiment for Super Claim 1")

# Super Claim 3 (Climate Impacts Not Bad): use filter() to select super claim 3

# Super claim 3 ("Climate Impacts Not Bad"): claim codes containing "3_".
na_3 <- filter(nature_analysis, str_detect(claim, "3_"))

Add word_count column using mutate()

# Add a word_count column (runs of non-whitespace = words).
# Fixed: reference the column as bare `text` inside mutate() instead of
# `na_3$text`, which bypasses tidy evaluation.
na_3 <- na_3 %>%
  mutate(word_count = str_count(text, "\\S+"))

Distribution visual, geom_histogram

# Histogram of claim lengths (words) for super claim 3, filled by sub-claim.
ggplot(data = na_3, mapping = aes(x = word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  labs(title = "Distribution of Claims",
       subtitle = "Claim 3") +
  theme(text = element_text(family = "Menlo-Bold", size = 12),
        legend.title = element_text(family = "Menlo-Bold", size = 12))

Tokenize using unnest_tokens()

# One row per word, then word frequencies sorted high to low.
na_3_tokenized <- na_3 %>%
  unnest_tokens(words, text) %>%
  count(words, sort = TRUE)

Filter() out stopwords()

# Drop tidytext stopwords plus citation artifacts ("et", "al", "2").
na_3_tokenized <- na_3_tokenized %>%
  filter(!words %in% stop_words$word,
         !words %in% c("et", "al", "2"))

Word Cloud 3

# Word cloud of the 200 most frequent claim-3 words (minimum frequency 5).
wordcloud(
  na_3_tokenized$words,
  freq = na_3_tokenized$n,
  max.words = 200,
  min.freq = 5,
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(12, "Paired")
)

# Claim-3 co-occurrence network: corpus -> cleaned tokens -> feature
# co-occurrence matrix -> network of the 30 most frequent features.
na_3_corpus <- corpus(na_3$text)

toks <- tokens(na_3_corpus, remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)

fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)

# Bigram counts for claim 3 (pairs containing a stopword removed).
# Note: the united column is named `bigrams` (plural) here and is
# referenced by that name later in the script.
na_3_claims <- select(na_3, text)

ngrams_3 <- na_3_claims %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  unite(bigrams, word1, word2, sep = " ") %>%
  count(bigrams, sort = TRUE)
# NRC sentiment for claim 3: score each text, total per sentiment,
# and plot the totals as a bar chart.
na_3_claims_vector <- as.character(na_3_claims$text)
na_3_sentiment <- get_nrc_sentiment(na_3_claims_vector)

na_3_sentiment_score <- data.frame(score = colSums(na_3_sentiment))
na_3_sentiment_score <- cbind(
  sentiment = rownames(na_3_sentiment_score),
  na_3_sentiment_score
)

ggplot(na_3_sentiment_score, aes(x = sentiment, y = score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  labs(x = "Sentiments", y = "Scores", title = "Sentiment for Super Claim 3")

# Super Claim 5 (Science/Scientists Not Reliable): use filter() to select super claim 5

# Super claim 5 ("Science/Scientists Not Reliable"): claim codes containing "5_".
na_5 <- filter(nature_analysis, str_detect(claim, "5_"))

Add word_count column using mutate()

# Add a word_count column (runs of non-whitespace = words).
# Fixed: reference the column as bare `text` inside mutate() instead of
# `na_5$text`, which bypasses tidy evaluation.
na_5 <- na_5 %>%
  mutate(word_count = str_count(text, "\\S+"))

Distribution visual, geom_histogram

# Histogram of claim lengths (words) for super claim 5, filled by sub-claim.
ggplot(data = na_5, mapping = aes(x = word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  labs(title = "Distribution of Claims",
       subtitle = "Claim 5") +
  theme(text = element_text(family = "Menlo-Bold", size = 12),
        legend.title = element_text(family = "Menlo-Bold", size = 12))

Tokenize using unnest_tokens()

# Tokenize claim-5 texts into words and tally frequencies.
# Fixed: tokenize the filtered na_5 subset — the original tokenized the
# full nature_analysis data set, so the claim-5 word counts, stopword
# filtering, and word cloud below actually described every claim.
na_5_tokenized <- na_5 %>%
  unnest_tokens(words, text)

na_5_tokenized <- na_5_tokenized %>%
  count(words) %>%
  arrange(desc(n))

Filter() out stopwords()

# Drop English stopwords before plotting.
na_5_tokenized <- filter(na_5_tokenized, !(words %in% stopwords("english")))

Word Cloud 5

# Word cloud for claim 5. Fixed: spell out the `colors` argument — the
# original passed `color =`, which only worked via partial matching.
wordcloud(
  na_5_tokenized$words,
  freq = na_5_tokenized$n,
  max.words = 200,
  min.freq = 5,
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(12, "Paired")
)

# Claim-5 co-occurrence network: corpus -> cleaned tokens -> feature
# co-occurrence matrix -> network of the 30 most frequent features.
na_5_corpus <- corpus(na_5$text)

toks <- tokens(na_5_corpus, remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)

fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)

# Bigram counts for claim 5 (pairs containing a stopword removed).
na_5_claims <- select(na_5, text)

ngrams_5 <- na_5_claims %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
# NRC sentiment for claim 5: score each text, total per sentiment,
# and plot the totals as a bar chart.
na_5_claims_vector <- as.character(na_5_claims$text)
na_5_claims_sentiment <- get_nrc_sentiment(na_5_claims_vector)

na_5_claims_sentiment_score <- data.frame(score = colSums(na_5_claims_sentiment))
na_5_claims_sentiment_score <- cbind(
  sentiment = rownames(na_5_claims_sentiment_score),
  na_5_claims_sentiment_score
)

ggplot(na_5_claims_sentiment_score, aes(sentiment, score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  labs(x = "Sentiment", y = "Score", title = "Sentiment for Super Claim 5")

# Super Claim 5_1: use filter() to select sub-claim 5_1

# Sub-claim 5_1: keep rows whose claim code contains "5_1".
na_5_1 <- filter(nature_analysis, str_detect(claim, "5_1"))

Add word_count column using mutate()

# Add a word_count column (runs of non-whitespace = words).
# Fixed: reference the column as bare `text` inside mutate() instead of
# `na_5_1$text`, which bypasses tidy evaluation.
na_5_1 <- na_5_1 %>%
  mutate(word_count = str_count(text, "\\S+"))

Tokenize using unnest_tokens()

# Tokenize sub-claim 5_1 texts into words and tally frequencies.
# Fixed: tokenize the filtered na_5_1 subset — the original tokenized
# the full nature_analysis data set, so this section's word frequencies
# and word cloud described every claim, not 5_1.
na_5_1_tokenized <- na_5_1 %>%
  unnest_tokens(words, text)

na_5_1_tokenized <- na_5_1_tokenized %>%
  count(words) %>%
  arrange(desc(n))

Filter() out stopwords()

# Drop English stopwords, then draw the word cloud.
na_5_1_tokenized <- na_5_1_tokenized %>%
  filter(!words %in% stopwords("english"))

# Fixed: spell out the `colors` argument — the original passed `color =`,
# which only worked via partial matching.
wordcloud(na_5_1_tokenized$words, freq = na_5_1_tokenized$n, max.words = 200,
          min.freq = 5, random.order = FALSE, random.color = FALSE,
          colors = brewer.pal(12, "Paired"))

# Sub-claim 5_1 co-occurrence network: corpus -> cleaned tokens ->
# feature co-occurrence matrix -> network of the 30 top features.
na_5_1_corpus <- corpus(na_5_1$text)

toks <- tokens(na_5_1_corpus, remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)

fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)

# Bigram counts for sub-claim 5_1 (pairs containing a stopword removed).
na_5_1_claims <- select(na_5_1, text)

ngrams_5_1 <- na_5_1_claims %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
# NRC sentiment totals for sub-claim 5_1, plotted as a bar chart.
na_5_1_claims_vector <- as.character(na_5_1_claims$text)
na_5_1_claims_sentiment <- get_nrc_sentiment(na_5_1_claims_vector)

na_5_1_claims_sentiment_score <- data.frame(score = colSums(na_5_1_claims_sentiment))
na_5_1_claims_sentiment_score <- cbind(
  sentiment = rownames(na_5_1_claims_sentiment_score),
  na_5_1_claims_sentiment_score
)

ggplot(na_5_1_claims_sentiment_score, aes(sentiment, score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  labs(x = "Sentiment", y = "Score", title = "Sentiment for Super Claim 5_1")

# Super Claim 5_2: use filter() to select sub-claim 5_2

# Sub-claim 5_2: keep rows whose claim code contains "5_2".
na_5_2 <- filter(nature_analysis, str_detect(claim, "5_2"))

Add word_count column using mutate()

# Add a word_count column (runs of non-whitespace = words).
# Fixed: reference the column as bare `text` inside mutate() instead of
# `na_5_2$text`, which bypasses tidy evaluation.
na_5_2 <- na_5_2 %>%
  mutate(word_count = str_count(text, "\\S+"))

Tokenize using unnest_tokens()

# Tokenize sub-claim 5_2 texts into words and tally frequencies.
# Fixed: tokenize the filtered na_5_2 subset — the original tokenized
# the full nature_analysis data set, so this section's word frequencies
# and word cloud described every claim, not 5_2.
na_5_2_tokenized <- na_5_2 %>%
  unnest_tokens(words, text)

na_5_2_tokenized <- na_5_2_tokenized %>%
  count(words) %>%
  arrange(desc(n))

Filter() out stopwords()

# Drop English stopwords, then draw the word cloud.
na_5_2_tokenized <- na_5_2_tokenized %>%
  filter(!words %in% stopwords("english"))

# Fixed: spell out the `colors` argument — the original passed `color =`,
# which only worked via partial matching.
wordcloud(na_5_2_tokenized$words, freq = na_5_2_tokenized$n, max.words = 200,
          min.freq = 5, random.order = FALSE, random.color = FALSE,
          colors = brewer.pal(12, "Paired"))

# Sub-claim 5_2 co-occurrence network: corpus -> cleaned tokens ->
# feature co-occurrence matrix -> network of the 30 top features.
na_5_2_corpus <- corpus(na_5_2$text)

toks <- tokens(na_5_2_corpus, remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)

fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))

fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)

# Bigram counts for sub-claim 5_2 (pairs containing a stopword removed).
na_5_2_claims <- select(na_5_2, text)

ngrams_5_2 <- na_5_2_claims %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
# NRC sentiment totals for sub-claim 5_2, plotted as a bar chart.
na_5_2_claims_vector <- as.character(na_5_2_claims$text)
na_5_2_claims_sentiment <- get_nrc_sentiment(na_5_2_claims_vector)

na_5_2_claims_sentiment_score <- data.frame(score = colSums(na_5_2_claims_sentiment))
na_5_2_claims_sentiment_score <- cbind(
  sentiment = rownames(na_5_2_claims_sentiment_score),
  na_5_2_claims_sentiment_score
)

ggplot(na_5_2_claims_sentiment_score, aes(sentiment, score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  labs(x = "Sentiment", y = "Score", title = "Sentiment for Super Claim 5_2")

#GGPPLOT comparisons

# Join the per-claim sentiment totals into one wide table
# (one row per sentiment, one column per super claim).
united_sent_score <- full_join(na_1_sentiment_score, na_3_sentiment_score,
                               by = "sentiment") %>%
  full_join(na_5_claims_sentiment_score, by = "sentiment") %>%
  rename(claim_1 = score.x, claim_3 = score.y, claim_5 = score)

# Fixed: the original drew three separate geom_col() layers on top of one
# another, so shorter bars were hidden behind taller ones rather than
# stacked. Pivot to long format and let a single geom_col() stack properly.
united_sent_score %>%
  pivot_longer(cols = c(claim_1, claim_3, claim_5),
               names_to = "claim", values_to = "score") %>%
  ggplot(aes(x = sentiment, y = score, fill = claim)) +
  geom_col(position = "stack") +
  theme_minimal() +
  scale_fill_manual(values = c(claim_5 = "#08589E", claim_1 = "#4EB3D3",
                               claim_3 = "#A8DDB5")) +
  theme(text = element_text(family = "Arial", size = 21),
        axis.text.y = element_text(family = "Arial", size = 21),
        axis.title = element_text(family = "Arial", size = 30),
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  labs(x = "Sentiment", y = "Score", title = "Sentiment Scores",
       subtitle = "Super Claims 1, 3, & 5", fill = "Claim")

#Word Cloud

# Bigram word clouds, one per claim.
# Fixed: the 5_1 and 5_2 clouds paired bigram labels with frequency
# vectors taken from the single-word token tables (mismatched lengths
# and meanings); each cloud now uses its own ngram table's `n`.
# Also spell out `colors` instead of relying on partial matching of `color`.
wordcloud(ngrams_1$bigram, freq = ngrams_1$n, max.words = 200, min.freq = 5,
          random.order = FALSE,
          colors = c("royalblue1", "seagreen2", "orangered"), family = "Avenir")

wordcloud(ngrams_3$bigrams, freq = ngrams_3$n, max.words = 200, min.freq = 5,
          random.order = FALSE,
          colors = c("royalblue1", "seagreen2", "orangered"), family = "Avenir")

wordcloud(ngrams_5$bigram, freq = ngrams_5$n, max.words = 200, min.freq = 5,
          random.order = FALSE, colors = brewer.pal(8, "Spectral"),
          family = "Avenir")

wordcloud(ngrams_5_1$bigram, freq = ngrams_5_1$n, max.words = 200, min.freq = 5,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"),
          family = "Avenir")

wordcloud(ngrams_5_2$bigram, freq = ngrams_5_2$n, max.words = 200, min.freq = 5,
          random.order = FALSE, colors = brewer.pal(8, "Set1"),
          family = "Avenir")

# Bigram Frequency Correlation

# Co-occurrence network built from the claim-3 bigrams.
# Fixed: tokenize ngrams_3_corpus (the bigram corpus created on the first
# line) — the original accidentally reused na_3_corpus, which silently
# reproduced the earlier claim-3 network instead of a bigram-based one.
ngrams_3_corpus <- corpus(ngrams_3$bigrams)

toks <- ngrams_3_corpus %>%
    tokens(remove_punct = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(pattern = stopwords("english"), padding = FALSE)

fcmat <- fcm(toks, context = "window", tri = FALSE)

feat <- names(topfeatures(fcmat, 30))

fcm_select(fcmat, pattern = feat) %>%
    textplot_network(min_freq = 0.5)

#Pyramid Plot

# Back-to-back pyramid plot: total sentiment scores for claim 1 (left)
# vs claim 3 (right), one row per NRC sentiment; `gap` leaves room for
# the centered sentiment labels.
pyramid.plot(na_1_sentiment_score$score, na_3_sentiment_score$score, labels = na_1_sentiment_score$sentiment, main = "Sentiment Comparison for Claims 1 & 3", gap = 300, top.labels = c("Super Claim 1: CC is Not Happening", "Sentiment", "Super Claim 3: CC is Not Bad"), show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))

## 3363 3363
## [1] 5.1 4.1 4.1 2.1
# Claim 1 (left) vs claim 5 (right); wider gap for the larger totals.
pyramid.plot(na_1_sentiment_score$score, na_5_claims_sentiment_score$score, labels = na_1_sentiment_score$sentiment, main = "Sentiment Comparison for Claims 1 and 5", top.labels = c("Super Claim 1: CC is Not Happening", "Sentiment", "Super Claim 5: Science/Scientist Not Reliable"), gap = 450, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))

## 6254 6254
## [1] 5.1 4.1 4.1 2.1
# Claim 3 (left) vs claim 5 (right).
pyramid.plot(na_3_sentiment_score$score, na_5_claims_sentiment_score$score, labels = na_3_sentiment_score$sentiment, main = "Sentiment Comparison for Claims 3 and 5", top.labels = c("Super Claim 3: CC is Not Bad", "Sentiment", "Super Claim 5: Science/Scientist Not Reliable"), gap = 500, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))

## 6254 6254
## [1] 5.1 4.1 4.1 2.1

right_join() to combine sentiment scores for super-claims 1 and 3; select() the relevant columns, rename() them, then transmute() to keep the "Sentiment" column and create a new "Score" column equal to Score_1 + Score_3.

# Join the claim-1 and claim-3 sentiment totals (keep = TRUE retains both
# key columns, so they appear as sentiment.x / sentiment.y).
sentiment_1_3 <- right_join(na_1_sentiment_score, na_3_sentiment_score, by = "sentiment", keep = TRUE)

# Keep one sentiment column plus both score columns, rename them, then
# collapse to a single combined Score = Score_1 + Score_3 per sentiment.
sentiment_1_3 <- sentiment_1_3 %>% 
  select(sentiment.x, score.x, score.y) %>% 
  rename(Sentiment = sentiment.x, Score_1 = score.x, Score_3 = score.y) %>%
  ungroup() %>% 
 transmute(Sentiment, Score = Score_1 + Score_3)
# Pyramid plot: combined claims 1+3 (left) vs claim 5 (right).
pyramid.plot(sentiment_1_3$Score, na_5_claims_sentiment_score$score, labels = na_5_claims_sentiment_score$sentiment, main = "Sentiment Comparison for Combined Claims 1 + 3 & 5", top.labels = c("Super Claim 1 & 3", "Sentiment", "Super Claim 5: Science/Scientist Not Reliable"), gap = 500, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))

## 6254 6254
## [1] 5.1 4.1 4.1 2.1
# Pyramid plot: sub-claim 5_1 (left) vs sub-claim 5_2 (right).
pyramid.plot(na_5_1_claims_sentiment_score$score, na_5_2_claims_sentiment_score$score, labels = na_5_1_claims_sentiment_score$sentiment, main = "Sentiment Comparison for Super Claim 5", top.labels = c("Super Claim 5_1:Science is Unreliable", "Sentiment", "Super Claim 5_2:Movement is Unreliable"), gap = 450, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))

## 3525 3525
## [1] 5.1 4.1 4.1 2.1

#Comparison Cloud Plot

# Word association plot — requires Java (rJava), so it is omitted here

#Other